mathlea_10 <- get_data("EDFacts_math_achievement_lea_2010_2019")

rlalea_10 <- get_data("EDFacts_rla_achievement_lea_2010_2019")

fiscal2010 <- get_data("NCES_CCD_fiscal_district_2010")

mathsch_10 <- get_data("EDFacts_math_achievement_sch_2010_2019")

rlasch_10 <- get_data("EDFacts_rla_achievement_sch_2010_2019")

Data Visualization 1

Rebecca

Research Question:

How do high school students’ subgroup makeup (i.e., Race/ethnicity, Male vs. Female, economically disadvantaged, Limited English, Migrant status, Disability status, and Homelessness) differ among states/regions?

Datasets:

  • EDFacts_math_achievement_sch_2010_2019
  • EDFacts_rla_achievement_sch_2010_2019
df <- full_join(mathsch_10, rlasch_10) 

df2 <- df %>% 
    mutate(ECDmath = gsub("[^0-9.-]", "", ECD_MTHHSPCTPROF),
                 ECDrla = gsub("[^0-9.-]", "", ECD_RLAHSPCTPROF),
                 LEPrla = gsub("[^0-9.-]", "", LEP_RLAHSPCTPROF),
                 LEPmath = gsub("[^0-9.-]", "", LEP_MTHHSPCTPROF),
                 HOMmath = gsub("[^0-9.-]", "", HOM_MTHHSPCTPROF),
                 HOMrla = gsub("[^0-9.-]", "", HOM_RLAHSPCTPROF),
                 Mmath = gsub("[^0-9.-]", "", M_MTHHSPCTPROF),
                 Mrla = gsub("[^0-9.-]", "", M_RLAHSPCTPROF),
                 Fmath = gsub("[^0-9.-]", "", F_MTHHSPCTPROF),
                 Frla = gsub("[^0-9.-]", "", F_RLAHSPCTPROF),
                 MBLmath = gsub("[^0-9.-]", "", MBL_MTHHSPCTPROF),
                 MBLrla = gsub("[^0-9.-]", "", MBL_RLAHSPCTPROF),
                 MHImath = gsub("[^0-9.-]", "", MHI_MTHHSPCTPROF),
                 MHIrla = gsub("[^0-9.-]", "", MHI_RLAHSPCTPROF),
                 MWHmath = gsub("[^0-9.-]", "", MWH_MTHHSPCTPROF),
                 MWHrla = gsub("[^0-9.-]", "", MWH_RLAHSPCTPROF),
                 CWDmath = gsub("[^0-9.-]", "", CWD_MTHHSPCTPROF),
                 CWDrla = gsub("[^0-9.-]", "", CWD_RLAHSPCTPROF))

df2$ECDmath = readr::parse_number(df2$ECDmath) 
df2$ECDrla = readr::parse_number(df2$ECDrla) 
df2$LEPrla = readr::parse_number(df2$LEPrla) 
df2$LEPmath = readr::parse_number(df2$LEPmath) 
df2$HOMrla = readr::parse_number(df2$HOMrla) 
df2$HOMmath = readr::parse_number(df2$HOMmath) 
df2$Mrla = readr::parse_number(df2$Mrla) 
df2$Mmath = readr::parse_number(df2$Mmath) 
df2$Frla = readr::parse_number(df2$Frla) 
df2$Fmath = readr::parse_number(df2$Fmath) 
df2$MBLrla = readr::parse_number(df2$MBLrla) 
df2$MBLmath = readr::parse_number(df2$MBLmath) 
df2$MHIrla = readr::parse_number(df2$MHIrla) 
df2$MHImath = readr::parse_number(df2$MHImath) 
df2$MWHmath = readr::parse_number(df2$MWHmath) 
df2$MWHrla = readr::parse_number(df2$MWHrla) 
df2$CWDmath = readr::parse_number(df2$CWDmath) 
df2$CWDrla = readr::parse_number(df2$CWDrla) 

dat <- df2 %>% 
    group_by(STNAM) %>% 
    summarise(
        mean_EconomicallyDisadvantaged_math = mean(ECDmath, na.rm = TRUE),
        mean_EconomicallyDisadvantaged_rla = mean(ECDrla, na.rm = TRUE),
        mean_EnglishLearner_math = mean(LEPmath, na.rm = TRUE),
        mean_EnglishLearner_rla = mean(LEPrla, na.rm = TRUE),
        mean_Homeless_math = mean(HOMmath, na.rm = TRUE),
        mean_Homeless_rla = mean(HOMrla, na.rm = TRUE),
        mean_Male_math = mean(Mmath, na.rm = TRUE),
        mean_Male_rla = mean(Mrla, na.rm = TRUE),
        mean_Female_math = mean(Fmath, na.rm = TRUE),
        mean_Female_rla = mean(Frla, na.rm = TRUE),
        mean_Black_math = mean(MBLmath, na.rm = TRUE),
        mean_Black_rla = mean(MBLrla, na.rm = TRUE),
        mean_Hispanic_math = mean(MHImath, na.rm = TRUE),
        mean_Hispanic_rla = mean(MHIrla, na.rm = TRUE),
        mean_White_math = mean(MWHmath, na.rm = TRUE),
        mean_White_rla = mean(MWHrla, na.rm = TRUE),
        mean_Disabled_math = mean(CWDmath, na.rm = TRUE),
        mean_Disabled_rla = mean(CWDrla, na.rm = TRUE)
    ) 

dat <- dat %>% filter(!STNAM == "BUREAU OF INDIAN AFFAIRS" & !STNAM == "stnam" & !STNAM == "PUERTO RICO") 

regions <- import(here("data","us_census_bureau_regions_and_divisions.csv")) %>%
    rename(state = `State Code`, `STNAM` = State)

df_regions <- regex_inner_join(dat, regions,ignore_case = TRUE) 

df_pivot <- df_regions %>% 
    pivot_longer(
        cols = starts_with("mean_"),
        names_to = c("Subgroup", "test"),
        values_to = "mean_pct",
        names_sep = "_",
        values_drop_na = TRUE,
        names_repair = "check_unique",
        names_prefix = "mean_"
    ) 

df_pivot$STNAM.y <- as.factor(df_pivot$STNAM.y)
df_pivot$Subgroup <- as.factor(df_pivot$Subgroup)
#Plot showing mean test score by subgroup and test

df_pivot %>%
    ggplot(aes(x = mean_pct,
                         y = fct_reorder(STNAM.y, mean_pct),
                         fill = mean_pct,
                         color = test)) +
    geom_point(size = .8,
                         alpha = .7) +
    facet_wrap(~Subgroup) +
    guides(fill = "none") +
    scale_color_discrete(l = 45,
                                             name = "Test",
                                             labels = c("Math", "Reading")) +
    labs(title = 'Mean percentage of students that scored at or above proficient',
             x = 'Mean percent proficient',
             y = 'State') +
    theme_minimal()

#Plot showing mean test score by subgroup and test by state

df_pivot %>%   
    ggplot(aes(x = mean_pct, 
                         y = fct_reorder(Subgroup, mean_pct))) +
    geom_jitter(aes(color = test),
                            size = .9,
                            alpha = .8) +
    facet_wrap(~STNAM.y) +
    scale_color_discrete(l = 70,
                                             name = "Test",
                                             labels = c("Math", "Reading")) +
    labs(title = 'Mean percentage of students that scored at or above proficient',
             x = 'Mean percent proficient',
             y = 'Group') 

#Plot showing mean test score by subgroup and test by region

ggplot(df_pivot, aes(x = mean_pct, 
                                         y = fct_reorder(Subgroup, mean_pct), 
                                         fill = Region,
                                         alpha = .7)) +
    guides(alpha = "none") +
    geom_density_ridges(rel_min_height = 0.005) +
    theme_minimal() +
    labs(title = 'Mean percentage of students that scored at or above proficient',
             x = 'Mean percent proficient',
             y = 'Group') 

#Trying to figure out how to show test scores separated by Gender but the plot won't create separate values for each

df_pivot %>%
    filter(Subgroup %in% c("Male", "Female")) %>%
    group_by(Subgroup) %>%
    ggplot(aes(x = mean_pct,
                         y = fct_reorder(STNAM.y, mean_pct),
                         fill = stat(x))) +
    geom_density_ridges_gradient(scale = 2, size = 0.3, rel_min_height = 0.001) +
    scale_fill_viridis_c(name = "%", option = "C") +
    coord_cartesian(clip = "off") +
    theme_ridges(grid = FALSE) +
    labs(title = 'Mean percentage of students that scored at or above proficient by gender',
             x = 'Mean percent proficient',
             y = 'State')

#Maps

us <- usa_sf()

us <- rename(us, state = iso_3166_2)

df_geo <- inner_join(df_pivot, us, by = "state")

dat2 <- full_join(df_pivot, df_geo)

#Math only
dat2 %>% 
    filter(test == "math") %>% 
    ggplot(aes(geometry = geometry, 
                         fill = mean_pct,
                         color = test), 
                 alpha = 0.9) + 
    facet_wrap(~Subgroup) +
    geom_sf(color = "white", size = 0) +
    guides(color = "none") +
    scale_fill_viridis(name = "%",
                                         breaks = c(0, 20, 40, 60, 80)) +
    labs(title = "Mean percentage of students that scored at or above proficient",
             subtitle = "Math",
             caption = "Source: U.S. Department of Education") +
    theme_void()

#Reading only
dat2 %>% 
    filter(test == "rla") %>% 
    ggplot(aes(geometry = geometry, 
                         fill = mean_pct,
                         color = test), 
                 alpha = 0.9) + 
    facet_wrap(~Subgroup) +
    geom_sf(color = "white", size = 0) +
    guides(color = "none") +
    scale_fill_viridis(name = "%",
                                         breaks = c(0, 20, 40, 60, 80)) +
    labs(title = "Mean percentage of students that scored at or above proficient",
             subtitle = "Reading",
             caption = "Source: U.S. Department of Education") +
    theme_void()

#combined map
ggplot(data = dat2, aes(geometry = geometry, 
                                                fill = mean_pct,
                                                color = test), 
             alpha = 0.9) + 
    facet_wrap(~Subgroup) +
    geom_sf(color = "white", size = 0) +
    guides(color = "none") +
    scale_fill_viridis(name = "%",
                                         breaks = c(0, 20, 40, 60, 80)) +
    labs(title = "Mean percentage of students that scored at or above proficient",
             subtitle = "Across Math and Reading",
             caption = "Source: U.S. Department of Education") +
    theme_void()

Data Visualization 2

Ksenia

Research Question:

What drives current expenditure on education? How is funding allocated by state and how do the funding allocations correlate with eh student performance (test scores)? Does cross-state variation in expenditure explain the cross-state variation in education outcomes? More specifically, I address the following questions:

– What are the gross amounts of total expenditure and how do they vary by state? - What amount of the total expenditure accounts for instruction, textbooks & special education? How do those allocations vary by state? - Does increased spending on any of the categories correlate with increased students’ performance?
- Can higher teachers’ salaries result in better test scores? (i.e. Does the spending on Instruction positively correlate with performance on standardized tests) - What amount of funding is spent on special education and how that correlates with performance of students with disabilities?

Not addressed at this point (yet?): - How do those percentages/ gross amounts correlate with the diversity of the student population/portion of white students in school?

Datasets:

  • EDFacts_rla_achievement_lea_2010_2019
  • EDFacts_math_achievement_lea_2010_2019
  • NCES_CCD_fiscal_district_2010
skim(fiscal2010)
Data summary
Name fiscal2010
Number of rows 18247
Number of columns 260
_______________________
Column type frequency:
character 129
numeric 130
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
AGCHRT 0 1 1 1 0 4 0
CBSA 0 1 1 5 0 939 0
CENSUSID 0 1 1 14 0 14842 0
CONUM 0 1 5 5 0 3129 0
CSA 0 1 1 3 0 125 0
FIPST 0 1 2 2 0 51 0
FL_19H 0 1 1 1 0 4 0
FL_21F 0 1 1 1 0 4 0
FL_31F 0 1 1 1 0 4 0
FL_41F 0 1 1 1 0 4 0
FL_61V 0 1 1 1 0 3 0
FL_66V 0 1 1 1 0 4 0
FL_A07 0 1 1 1 0 4 0
FL_A08 0 1 1 1 0 3 0
FL_A09 0 1 1 1 0 3 0
FL_A11 0 1 1 1 0 3 0
FL_A13 0 1 1 1 0 3 0
FL_A15 0 1 1 1 0 4 0
FL_A20 0 1 1 1 0 4 0
FL_A40 0 1 1 1 0 4 0
FL_B10 0 1 1 1 0 3 0
FL_B11 0 1 1 1 0 3 0
FL_B12 0 1 1 1 0 4 0
FL_B13 0 1 1 1 0 3 0
FL_C01 0 1 1 1 0 4 0
FL_C04 0 1 1 1 0 4 0
FL_C05 0 1 1 1 0 3 0
FL_C06 0 1 1 1 0 3 0
FL_C07 0 1 1 1 0 4 0
FL_C08 0 1 1 1 0 3 0
FL_C09 0 1 1 1 0 4 0
FL_C10 0 1 1 1 0 3 0
FL_C11 0 1 1 1 0 3 0
FL_C12 0 1 1 1 0 3 0
FL_C13 0 1 1 1 0 4 0
FL_C14 0 1 1 1 0 4 0
FL_C15 0 1 1 1 0 3 0
FL_C16 0 1 1 1 0 3 0
FL_C17 0 1 1 1 0 3 0
FL_C19 0 1 1 1 0 3 0
FL_C20 0 1 1 1 0 4 0
FL_C24 0 1 1 1 0 3 0
FL_C25 0 1 1 1 0 4 0
FL_C35 0 1 1 1 0 3 0
FL_C36 0 1 1 1 0 4 0
FL_C38 0 1 1 1 0 4 0
FL_C39 0 1 1 1 0 4 0
FL_D11 0 1 1 1 0 4 0
FL_D23 0 1 1 1 0 4 0
FL_E07 0 1 1 1 0 5 0
FL_E08 0 1 1 1 0 5 0
FL_E09 0 1 1 1 0 5 0
FL_E11 0 1 1 1 0 4 0
FL_E13 0 1 1 1 0 5 0
FL_E17 0 1 1 1 0 5 0
FL_F12 0 1 1 1 0 4 0
FL_G15 0 1 1 1 0 4 0
FL_HE1 0 1 1 1 0 4 0
FL_HE2 0 1 1 1 0 4 0
FL_HR1 0 1 1 1 0 4 0
FL_I86 0 1 1 1 0 4 0
FL_K09 0 1 1 1 0 5 0
FL_K10 0 1 1 1 0 4 0
FL_K11 0 1 1 1 0 4 0
FL_L12 0 1 1 1 0 3 0
FL_M12 0 1 1 1 0 3 0
FL_MEMBERSCH 0 1 1 1 0 3 0
FL_Q11 0 1 1 1 0 4 0
FL_T02 0 1 1 1 0 4 0
FL_T06 0 1 1 1 0 4 0
FL_T09 0 1 1 1 0 3 0
FL_T15 0 1 1 1 0 3 0
FL_T40 0 1 1 1 0 3 0
FL_T99 0 1 1 1 0 4 0
FL_U11 0 1 1 1 0 3 0
FL_U22 0 1 1 1 0 4 0
FL_U30 0 1 1 1 0 3 0
FL_U50 0 1 1 1 0 4 0
FL_U97 0 1 1 1 0 4 0
FL_V10 0 1 1 1 0 5 0
FL_V11 0 1 1 1 0 4 0
FL_V12 0 1 1 1 0 4 0
FL_V13 0 1 1 1 0 3 0
FL_V14 0 1 1 1 0 4 0
FL_V15 0 1 1 1 0 3 0
FL_V16 0 1 1 1 0 4 0
FL_V17 0 1 1 1 0 3 0
FL_V18 0 1 1 1 0 4 0
FL_V21 0 1 1 1 0 3 0
FL_V22 0 1 1 1 0 4 0
FL_V23 0 1 1 1 0 4 0
FL_V24 0 1 1 1 0 5 0
FL_V29 0 1 1 1 0 3 0
FL_V30 0 1 1 1 0 5 0
FL_V32 0 1 1 1 0 3 0
FL_V33 0 1 1 1 0 3 0
FL_V37 0 1 1 1 0 4 0
FL_V38 0 1 1 1 0 5 0
FL_V40 0 1 1 1 0 4 0
FL_V45 0 1 1 1 0 5 0
FL_V60 0 1 1 1 0 4 0
FL_V65 0 1 1 1 0 4 0
FL_V70 0 1 1 1 0 3 0
FL_V75 0 1 1 1 0 3 0
FL_V80 0 1 1 1 0 4 0
FL_V85 0 1 1 1 0 4 0
FL_V90 0 1 1 1 0 5 0
FL_V91 0 1 1 1 0 4 0
FL_V92 0 1 1 1 0 3 0
FL_V93 0 1 1 1 0 4 0
FL_W01 0 1 1 1 0 4 0
FL_W31 0 1 1 1 0 4 0
FL_W61 0 1 1 1 0 4 0
FL_Z32 0 1 1 1 0 4 0
FL_Z33 0 1 1 1 0 4 0
FL_Z34 0 1 1 1 0 5 0
FL_Z35 0 1 1 1 0 4 0
FL_Z36 0 1 1 1 0 3 0
FL_Z37 0 1 1 1 0 4 0
FL_Z38 0 1 1 1 0 3 0
FILEURL 0 1 48 48 0 1 0
GSHI 0 1 1 2 0 16 0
GSLO 0 1 1 2 0 16 0
LEAID 0 1 7 7 0 18247 0
NAME 0 1 3 60 0 17717 0
SCHLEV 0 1 1 2 0 7 0
STABBR 0 1 2 2 0 51 0
STNAME 0 1 4 20 0 51 0
PIPELINE 0 1 29 29 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
A07 0 1 62321.43 402543.33 -2 0 0 9000.0 20229000 ▇▁▁▁▁
A08 0 1 5939.67 128277.93 -2 0 0 0.0 15564000 ▇▁▁▁▁
A09 0 1 364123.59 1144107.53 -2 3000 79000 319000.0 47205000 ▇▁▁▁▁
A11 0 1 10957.26 85439.61 -2 0 0 0.0 3510000 ▇▁▁▁▁
A13 0 1 211909.26 1823605.27 -2 0 8000 97000.0 171949000 ▇▁▁▁▁
A15 0 1 8501.85 191879.04 -2 0 0 0.0 14939000 ▇▁▁▁▁
A20 0 1 91805.46 808818.77 -2 0 0 3000.0 49072000 ▇▁▁▁▁
A40 0 1 41828.48 293969.27 -2 0 0 9000.0 19348000 ▇▁▁▁▁
B10 0 1 68834.01 917277.92 -2 0 0 0.0 59398000 ▇▁▁▁▁
B11 0 1 19931.51 259028.50 -2 0 0 0.0 28169000 ▇▁▁▁▁
B12 0 1 5454.66 56806.67 -2 0 0 0.0 3111000 ▇▁▁▁▁
B13 0 1 162690.86 2006666.21 -2 0 0 24000.0 213069000 ▇▁▁▁▁
C01 0 1 9904463.16 56726682.92 -2 481000 2407000 7717500.0 6016802000 ▇▁▁▁▁
C04 0 1 177204.10 1791292.67 -2 0 0 0.0 139157000 ▇▁▁▁▁
C05 0 1 918167.93 12489751.14 -2 0 0 331000.0 1445472000 ▇▁▁▁▁
C06 0 1 255034.54 3212965.61 -2 0 0 47000.0 262095000 ▇▁▁▁▁
C07 0 1 33335.24 848165.31 -2 0 0 0.0 52887000 ▇▁▁▁▁
C08 0 1 23623.02 667675.15 -2 0 0 0.0 68096000 ▇▁▁▁▁
C09 0 1 43071.09 351589.50 -2 0 0 0.0 16140000 ▇▁▁▁▁
C10 0 1 28732.95 385107.93 -2 0 2000 12000.0 42871000 ▇▁▁▁▁
C11 0 1 340673.71 3307919.59 -2 0 0 0.0 289060000 ▇▁▁▁▁
C12 0 1 219562.73 1235888.02 -2 0 0 68000.0 76286000 ▇▁▁▁▁
C13 0 1 1467624.72 15568842.66 -2 1000 76000 506000.0 1684314000 ▇▁▁▁▁
C14 0 1 928407.42 8605443.22 -2 23000 146000 508000.0 777993000 ▇▁▁▁▁
C15 0 1 783204.43 4229045.40 -2 0 69000 473000.0 257678000 ▇▁▁▁▁
C16 0 1 91089.01 662844.07 -2 0 0 43000.0 58822000 ▇▁▁▁▁
C17 0 1 13732.90 128064.08 -2 0 0 4500.0 10177000 ▇▁▁▁▁
C19 0 1 34663.02 223558.75 -2 0 0 2000.0 15750000 ▇▁▁▁▁
C20 0 1 1216737.06 6249021.27 -2 1000 178000 753000.0 394540000 ▇▁▁▁▁
C24 0 1 198884.54 5846320.83 -2 0 0 0.0 729737000 ▇▁▁▁▁
C25 0 1 658136.53 4121046.74 -2 11000 118000 399500.0 321606000 ▇▁▁▁▁
C35 0 1 111686.04 1947391.23 -2 0 0 0.0 208110000 ▇▁▁▁▁
C36 0 1 141945.32 4773984.42 -2 0 0 0.0 637078000 ▇▁▁▁▁
C38 0 1 631631.02 3267240.10 -2 0 0 161000.0 144110000 ▇▁▁▁▁
C39 0 1 31920.66 320989.51 -2 0 0 0.0 19306000 ▇▁▁▁▁
CCDNF 0 1 1.00 0.04 0 1 1 1.0 1 ▁▁▁▁▇
CENFILE 0 1 0.81 0.39 0 1 1 1.0 1 ▂▁▁▁▇
D11 0 1 603561.64 3194163.22 -2 0 0 208000.0 201357000 ▇▁▁▁▁
D23 0 1 442007.85 3606971.99 -2 0 0 46000.0 340982000 ▇▁▁▁▁
E07 0 1 1366785.35 6893020.79 -2 34000 227000 952000.0 479925000 ▇▁▁▁▁
E08 0 1 550224.54 1818703.71 -2 101000 283000 593000.0 144467000 ▇▁▁▁▁
E09 0 1 1559184.15 7539644.20 -2 102000 381000 1206000.0 528651000 ▇▁▁▁▁
E11 0 1 1071170.56 5492398.66 -2 50000 261000 830000.0 448671000 ▇▁▁▁▁
E13 0 1 17327081.34 127893156.30 -2 1171000 4198000 13455500.0 14936045000 ▇▁▁▁▁
E17 0 1 1578756.96 6042710.01 -2 46000 286000 1203000.0 333309000 ▇▁▁▁▁
F12 0 1 2583185.58 27822230.17 -2 0 37000 675000.0 3044559000 ▇▁▁▁▁
G15 0 1 181778.44 1733083.39 -2 0 0 0.0 106072000 ▇▁▁▁▁
HE1 0 1 1388857.91 10400685.61 -2 55000 289000 968000.0 1050214000 ▇▁▁▁▁
HE2 0 1 64137.90 490090.60 -2 0 0 24000.0 36903000 ▇▁▁▁▁
HR1 0 1 218646.58 2227187.59 -2 0 26000 115000.0 232282000 ▇▁▁▁▁
I86 0 1 971108.03 6437069.74 -2 0 64000 529000.0 439446000 ▇▁▁▁▁
K09 0 1 137397.72 665773.23 -2 0 18000 95000.0 48388000 ▇▁▁▁▁
K10 0 1 353665.65 1589523.52 -2 2000 65000 242000.0 61852000 ▇▁▁▁▁
K11 0 1 23581.48 246991.88 -2 0 0 0.0 12401000 ▇▁▁▁▁
L12 0 1 80185.74 1349349.83 -2 0 0 0.0 111938000 ▇▁▁▁▁
M12 0 1 11888.16 391698.39 -2 0 0 0.0 49058000 ▇▁▁▁▁
MEMBERSCH 0 1 2690.61 12815.67 -9 167 641 2083.5 1014020 ▇▁▁▁▁
Q11 0 1 702918.96 7421719.40 -2 0 33000 283000.0 634364000 ▇▁▁▁▁
T02 0 1 2639423.57 73594947.69 -2 -2 -2 -2.0 9073697000 ▇▁▁▁▁
T06 0 1 9271963.74 39717090.27 -2 -2 1187000 5982000.0 1818529000 ▇▁▁▁▁
T09 0 1 203461.80 3394275.08 -2 -2 0 0.0 161332000 ▇▁▁▁▁
T15 0 1 19918.86 255405.26 -2 -2 0 0.0 20748000 ▇▁▁▁▁
T40 0 1 99695.20 1287236.16 -2 -2 0 0.0 110682000 ▇▁▁▁▁
T99 0 1 74279.41 909702.46 -2 -2 0 0.0 58982000 ▇▁▁▁▁
TCAPOUT 0 1 3279609.48 29521711.94 -2 35000 285000 1333000.0 3151607000 ▇▁▁▁▁
TCURELSC 0 1 28436776.91 179053170.29 -2 2109000 7146000 22648500.0 19453219000 ▇▁▁▁▁
TCURINST 0 1 17327081.34 127893156.30 -2 1171000 4198000 13455500.0 14936045000 ▇▁▁▁▁
TCUROTH 0 1 1140166.50 5600778.27 -2 55500 283000 874000.0 448671000 ▇▁▁▁▁
TCURSSVC 0 1 9969528.76 48510265.33 -2 776000 2588000 8035000.0 4068503000 ▇▁▁▁▁
TFEDREV 0 1 4124828.42 25252763.77 -2 261000 899000 2680000.0 2047926000 ▇▁▁▁▁
TLOCREV 0 1 15170253.20 97409268.54 -2 499000 3031000 10780000.0 10600597000 ▇▁▁▁▁
TNONELSE 0 1 363810.50 3018302.79 -2 0 3000 109000.0 175673000 ▇▁▁▁▁
TOTALEXP 0 1 34140053.39 224183538.28 -2 2408000 8445000 26676000.0 24597709000 ▇▁▁▁▁
TOTALREV 0 1 33481814.83 199406458.00 -2 2434500 8493000 26767000.0 21023695000 ▇▁▁▁▁
TSTREV 0 1 14186732.90 85066587.68 -2 945000 3617000 11322500.0 8375172000 ▇▁▁▁▁
U11 0 1 20131.32 332556.14 -2 0 0 0.0 28720000 ▇▁▁▁▁
U22 0 1 108350.53 735676.31 -2 1000 12000 56000.0 73023000 ▇▁▁▁▁
U30 0 1 19000.78 222943.67 -2 0 0 0.0 19854000 ▇▁▁▁▁
U50 0 1 52941.97 586124.91 -2 0 0 13000.0 48929000 ▇▁▁▁▁
U97 0 1 619238.19 11118869.62 -2 6000 52000 244000.0 1421630000 ▇▁▁▁▁
V10 0 1 3998687.90 38204603.80 -2 186000 870000 3139000.0 4756409000 ▇▁▁▁▁
V11 0 1 1053954.52 4087112.12 -2 15000 176000 789500.0 243582000 ▇▁▁▁▁
V12 0 1 336373.22 1395789.52 -2 2000 51000 247000.0 83188000 ▇▁▁▁▁
V13 0 1 806004.12 4234643.26 -2 10000 118000 543000.0 321611000 ▇▁▁▁▁
V14 0 1 263026.59 1390817.64 -2 2000 37000 186000.0 108630000 ▇▁▁▁▁
V15 0 1 239975.18 825385.53 -2 34000 142000 269000.0 82474000 ▇▁▁▁▁
V16 0 1 92164.15 323598.91 -2 8000 42000 99000.0 25761000 ▇▁▁▁▁
V17 0 1 1107942.03 5242875.32 -2 63500 257000 835000.0 341063000 ▇▁▁▁▁
V18 0 1 370250.30 2069478.16 -2 15000 79000 289000.0 160978000 ▇▁▁▁▁
V21 0 1 978273.54 6346261.99 -2 31000 184000 681500.0 661475000 ▇▁▁▁▁
V22 0 1 388822.12 3028190.56 -2 8000 68000 276000.0 327418000 ▇▁▁▁▁
V23 0 1 416869.36 1827291.51 -2 0 46000 272500.0 68954000 ▇▁▁▁▁
V24 0 1 167106.71 771339.05 -2 0 12000 97000.0 34626000 ▇▁▁▁▁
V29 0 1 357825.95 2204270.87 -2 0 69000 259000.0 221534000 ▇▁▁▁▁
V30 0 1 131409.06 743125.50 -2 0 20000 89000.0 69482000 ▇▁▁▁▁
V32 0 1 4498.39 71129.59 -2 0 0 0.0 4338000 ▇▁▁▁▁
V33 0 1 2696.60 12832.19 -9 165 643 2092.0 1014020 ▇▁▁▁▁
V37 0 1 446569.69 2413022.59 -2 0 87000 315000.0 237834000 ▇▁▁▁▁
V38 0 1 187349.27 1238275.08 -2 0 29000 122000.0 117724000 ▇▁▁▁▁
V40 0 1 2712894.19 16354709.32 -2 187000 690000 2114500.0 1661029000 ▇▁▁▁▁
V45 0 1 1219745.94 8586494.05 -2 35000 269000 978500.0 1024981000 ▇▁▁▁▁
V60 0 1 62758.66 644426.67 -2 0 0 0.0 31334000 ▇▁▁▁▁
V65 0 1 6236.98 120315.98 -2 0 0 0.0 9815000 ▇▁▁▁▁
V70 0 1 201312.99 1906604.78 -2 0 0 43000.0 118338000 ▇▁▁▁▁
V75 0 1 110767.97 1647700.62 -2 0 0 0.0 148516000 ▇▁▁▁▁
V80 0 1 51729.23 615328.22 -2 0 0 0.0 40513000 ▇▁▁▁▁
V85 0 1 248.65 33609.35 -2 0 0 0.0 4540000 ▇▁▁▁▁
V90 0 1 981687.90 6017663.61 -2 43000 202000 676000.0 591836000 ▇▁▁▁▁
V91 0 1 180739.64 5702851.48 -2 0 0 0.0 738402000 ▇▁▁▁▁
V92 0 1 113014.75 3320447.49 -2 0 0 0.0 383908000 ▇▁▁▁▁
V93 0 1 132874.73 1178546.64 -2 0 8000 70000.0 115987000 ▇▁▁▁▁
W01 0 1 864911.54 8526711.15 -2 0 0 182000.0 701763000 ▇▁▁▁▁
W31 0 1 2592562.11 31381207.74 -2 0 0 132000.0 3650875000 ▇▁▁▁▁
W61 0 1 5791300.75 25187070.39 -2 79000 1284000 4379500.0 2178242000 ▇▁▁▁▁
WEIGHT 0 1 1.00 0.00 1 1 1 1.0 1 ▁▁▇▁▁
Z32 0 1 17275789.18 99544925.43 -2 1078000 4110000 13449000.0 10254583000 ▇▁▁▁▁
Z33 0 1 11689210.40 76409484.15 -2 719000 2755000 9035000.0 8613741000 ▇▁▁▁▁
Z34 0 1 6003717.44 46330908.53 -2 284000 1321000 4773000.0 5409968000 ▇▁▁▁▁
Z35 0 1 5979280.17 39123700.39 -2 0 497000 4110500.0 4011575000 ▇▁▁▁▁
Z36 0 1 1360212.70 14071446.23 -2 0 41000 776000.0 1713079000 ▇▁▁▁▁
Z37 0 1 234278.74 3988673.40 -2 0 0 89000.0 524807000 ▇▁▁▁▁
Z38 0 1 296060.84 1767268.94 -2 0 0 111000.0 78518000 ▇▁▁▁▁
_19H 0 1 21207890.62 143384108.88 -2 0 1330000 12291500.0 11615909000 ▇▁▁▁▁
_21F 0 1 2646400.13 39506333.52 -2 0 0 0.0 4342818000 ▇▁▁▁▁
_31F 0 1 1937930.41 16398282.10 -2 0 138000 925000.0 1210275000 ▇▁▁▁▁
_41F 0 1 21921404.30 163411365.20 -2 0 1347000 12785500.0 12642529000 ▇▁▁▁▁
_61V 0 1 502922.79 6044542.90 -2 0 0 0.0 625105000 ▇▁▁▁▁
_66V 0 1 437682.37 3431237.57 -2 0 0 0.0 230000000 ▇▁▁▁▁
_YEAR 0 1 10.00 0.00 10 10 10 10.0 10 ▁▁▇▁▁
YEAR 0 1 2010.00 0.00 2010 2010 2010 2010.0 2010 ▁▁▇▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
DL_INGESTION_DATETIME 0 1 2021-09-02 13:02:24 2021-09-02 13:02:24 2021-09-02 13:02:24 1
regions <- import(here("data", "us_census_bureau_regions_and_divisions.csv"), setclass = "tbl_df") %>% 
  rename(`STNAME` = State,
         STABBR = `State Code`) 

# Total spendings by state
total <-fiscal2010 %>% 
  group_by(STNAME) %>%
  summarise(TOTALEXP = sum(TOTALEXP),
            V33 = sum(V33)) 

total2 <- left_join(total, regions, by = "STNAME")

#Change scale of total spending
total2$TOTALEXP_B  <- total2$TOTALEXP/1000000000

#calculating the total spending per student
total2 <- total2 %>% 
  mutate(spend_per_stu = TOTALEXP/V33,
         spend_per_stu_k = (TOTALEXP/V33)/1000)

#spending on instruction by state 
instruction <- fiscal2010 %>%
  group_by(STNAME) %>%
      summarise(E13 = sum(E13), 
                V33 = sum(V33))

instruction2 <- left_join(instruction, regions, by = "STNAME")

#Change scale of total spending on instruction
instruction2$E13_B  <- instruction2$E13/1000000000

#calculating instruction spending per student
instruction2 <- instruction2 %>% 
  mutate(instr_per_stu = E13/V33,
         instr_per_stu_k = (E13/V33)/1000)

#spending on Special Ed teachers
SpEd <- fiscal2010 %>%
  group_by(STNAME) %>%
      summarise(Z36 = sum(Z36), 
                V33 = sum(V33))

SpEd2 <- left_join(SpEd, regions, by = "STNAME")

#removing the states with no data on Special Ed spending
SpEd2 <- SpEd2[SpEd2$Z36 >= 0, ]

#Change scale of total spending on SPecial Ed
SpEd2$Z36_B  <- SpEd2$Z36/1000000000

#spending on textbooks by state 
textbooks <- fiscal2010 %>%
  group_by(STNAME) %>%
      summarise(V93 = sum(V93),
                V33 = sum(V33))
textbooks2 <- left_join(textbooks, regions, by = "STNAME")

#removing the states with no data on textbook spending
textbooks2 <- textbooks2[textbooks2$V93 >= 0, ]

#Change scale of total spending on textbooks
textbooks2$V93_M  <- textbooks2$V93/1000000

#calculating textbook spending per student
textbooks2 <- textbooks2 %>% 
  mutate(book_per_stu = V93/V33)
#getting reading and language arts data
rla <- rlalea_10 %>% 
  select(YEAR, 
         STNAM, 
         ALL_RLA00PCTPROF,
         MWH_RLA00PCTPROF,
         CWD_RLA00PCTPROF) %>% 
  rename(`STNAME` = STNAM) %>% 
  mutate(STNAME = str_to_title(STNAME), 
         (across(ALL_RLA00PCTPROF:CWD_RLA00PCTPROF, 
                 ~replace(., . %in% c("PS", 
                                      "n/a",    
                                      "LT50",   
                                      "LE5",    
                                      "LE20",   
                                      "LE10",   
                                      "GE99",   
                                      "GE95",   
                                      "GE90",   
                                      "GE80",   
                                      "GE50"), NA))))

#Mean rla scores for all students
rla <- rla %>% 
  tidyr::separate(ALL_RLA00PCTPROF, c("all_lower", "all_upper"), sep = "-") %>% 
  mutate(
    all_upper = ifelse(is.na(all_upper), all_lower, all_upper),
    all_lower = as.numeric(all_lower),
    all_upper = as.numeric(all_upper)
    ) %>% 
  rowwise() %>% 
  mutate(mean_rla_all = mean(c(all_lower, all_upper))) %>% 
  ungroup()

#Mean rla scores for white students
rla <- rla %>% 
  tidyr::separate(MWH_RLA00PCTPROF, c("wh_lower", "wh_upper"), sep = "-") %>% 
  mutate(
    wh_upper = ifelse(is.na(wh_upper), wh_lower, wh_upper),
    wh_lower = as.numeric(wh_lower),
    wh_upper = as.numeric(wh_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanf_rla_wh = mean(c(wh_lower, wh_upper))) %>% 
  ungroup()

# mean rla scores for students with disabilities
rla <- rla %>% 
  tidyr::separate(CWD_RLA00PCTPROF, c("dis_lower", "dis_upper"), sep = "-") %>% 
  mutate(
    dis_upper = ifelse(is.na(dis_upper), dis_lower, dis_upper),
    dis_lower = as.numeric(dis_lower),
    dis_upper = as.numeric(dis_upper)
    ) %>% 
  rowwise() %>% 
  mutate(mean_rla_dis = mean(c(dis_lower, dis_upper))) %>% 
  ungroup()

rla_new <- rla %>% 
  select(YEAR, 
         STNAME,
         contains("mean"))

#getting math data
math <- mathlea_10 %>% 
  select(YEAR, 
         STNAM, 
         ALL_MTH00PCTPROF,
         MWH_MTH00PCTPROF,
         CWD_MTH00PCTPROF) %>% 
  rename(`STNAME` = STNAM) %>%
  mutate(STNAME = str_to_title(STNAME), 
         (across(ALL_MTH00PCTPROF:CWD_MTH00PCTPROF, 
                 ~replace(., . %in% c("PS", 
                                      "n/a",    
                                      "LT50",   
                                      "LE5",    
                                      "LE20",   
                                      "LE10",   
                                      "GE99",   
                                      "GE95",   
                                      "GE90",   
                                      "GE80",   
                                      "GE50"), NA))))

#Mean math scores for all students
math <- math %>% 
  tidyr::separate(ALL_MTH00PCTPROF, c("all_lower", "all_upper"), sep = "-") %>% 
  mutate(
    all_upper = ifelse(is.na(all_upper), all_lower, all_upper),
    all_lower = as.numeric(all_lower),
    all_upper = as.numeric(all_upper)
    ) %>% 
  rowwise() %>% 
  mutate(mean_math_all = mean(c(all_lower, all_upper))) %>% 
  ungroup()

#Mean math scores for white students
math <- math %>% 
  tidyr::separate(MWH_MTH00PCTPROF, c("wh_lower", "wh_upper"), sep = "-") %>% 
  mutate(
    wh_upper = ifelse(is.na(wh_upper), wh_lower, wh_upper),
    wh_lower = as.numeric(wh_lower),
    wh_upper = as.numeric(wh_upper)
    ) %>% 
  rowwise() %>% 
  mutate(mean_math_wh = mean(c(wh_lower, wh_upper))) %>% 
  ungroup()

# mean mathscores for students with disabilities
math <- math %>% 
  tidyr::separate(CWD_MTH00PCTPROF, c("dis_lower", "dis_upper"), sep = "-") %>% 
  mutate(
    dis_upper = ifelse(is.na(dis_upper), dis_lower, dis_upper),
    dis_lower = as.numeric(dis_lower),
    dis_upper = as.numeric(dis_upper)
    ) %>% 
  rowwise() %>% 
  mutate(mean_math_dis = mean(c(dis_lower, dis_upper))) %>% 
  ungroup()

math_new <- math %>% 
  select(YEAR, 
         STNAME,
         contains("mean"))

scores <- full_join(math_new, 
                    rla_new, 
                              by = "STNAME")

#dataset with all the scores and the total spending on Education
scores_spending <- full_join(scores, 
                              total2,
                              by = "STNAME")

#dataset with all the scores and the total spending on Instruction
scores_instruction <- full_join(scores, 
                              instruction2,
                              by = "STNAME")

#dataset with all the scores and the total spending on Special Ed
scores_SpEd <- full_join(scores, 
                              SpEd2,
                              by = "STNAME")

#dataset with all the scores and the total spending on textbooks
scores_books <- full_join(scores, 
                              textbooks2,
                              by = "STNAME")
total2  %>%
  ggplot(aes(TOTALEXP_B, fct_reorder(STNAME, TOTALEXP_B))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 75),
                     breaks = c(0, 10, 20, 30, 40, 50, 60, 70), 
                     labels = c("0", "10 billion", "20 billion", "30 billion", "40 billion", "50 billion", "60 billion", "70 billion")) + 
  labs(title = "Total Spending on Education by State",
       y = "State",
       x = "Total Yearly Spending, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(TOTALEXP_B, STNAME, label = paste (round(TOTALEXP_B, 2), " B.")),
    nudge_x = 0.9,
    size = 2
  ) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

#An average spending by the region
total_rg_avg <- total2 %>% 
  group_by(Region) %>% 
  summarize(rg_avg = mean(TOTALEXP_B))

total_rg_avg  %>%
  ggplot(aes(rg_avg, fct_reorder(Region,rg_avg))) +
  geom_col(alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 20),
                     breaks = c(0, 5, 10, 15), 
                     labels = c("0", "5 billion", "10 billion", "15 billion")) + 
  labs(title = "Average Total Spending on Education by Region",
       y = "Region",
       x = "Avergae Yearly Spending, in USD",
       caption = "Source: National Center for Education Statistics") + 
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

TODO: format ‘by region’ plot if keeping it

When looking at individual states, we can observe that California has the largest total spending on Education, followed by New York, and Texas. North Dakota and South Dakota yield the smallest total expensiture on education, with both states spending less $1.5B per year. The raw amounts of total spending don’t give us a full picture, because states vary by population size and the number of students enrolled in high schools. Thus, it is beneficial to look at the expenditure per single student.

#plotting total spending per student by state
total2  %>%
  ggplot(aes(spend_per_stu_k, fct_reorder(STNAME, spend_per_stu_k))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 30),
                     breaks = c(0, 5, 10, 15, 20, 25), 
                     labels = c("0", "5K", "10K", "15K", "20K", "25K")) + 
  labs(title = "Total Spending on Education Per Single Student by State",
       y = "State",
       x = "Total Yearly Spending Per Student, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(spend_per_stu_k, STNAME, label = paste (round(spend_per_stu_k, 2), " K")),
    nudge_x = 0.45,
    size = 2) +
  geom_vline(aes(xintercept = mean(total2$spend_per_stu_k)), 
             data = total2, 
             linetype = 'dashed',
             size = 0.6,
             alpha = 1, 
             color = "gray60") +
  geom_label(data = total2, 
             aes(x = mean(total2$spend_per_stu_k)+1.6, 
                 y = 10,
                 label = "Avg. Spending
Per Student 
in the US = 
$ 12,977",
                 fill = NULL), 
                 color = "gray60",
                 size = 4,
                 fontface = "italic",
                 label.size = NA)  +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

#calculating an average spending per student by the region
total_rg_avg_stu <- total2 %>% 
  group_by(Region) %>% 
  summarize(rg_avg_stu = mean(spend_per_stu_k))

#plotting by region
total_rg_avg_stu  %>%
  ggplot(aes(rg_avg_stu, fct_reorder(Region,rg_avg_stu))) +
  geom_col(alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 20),
                     breaks = c(0, 5, 10, 15), 
                     labels = c("0", "5K", "10K", "15K")) + 
  labs(title = "Average Total Spending on Education per Student by Region",
       y = "Region",
       x = "Avergae Yearly Spending per Student, in USD",
       caption = "Source: National Center for Education Statistics") + 
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

TO_DO: format the ‘by region’ plot, if keeping it TODO: figure out how to remove ’a’s from the legend boxes

District of Columbia spent the most total on a single student ($25,613) in 2010, followed by New York ($21,770) and Vermont ($19,541). All Northeastern states are in the top 15 states with highest spending per student, which is reflected in the second plot demonstrating that in Northeastern states, an average spending per single students is the highest and equals $17,194.

The three states with the lowest spending per students are Idaho ($7,798), Utah ($8,040), and Arizona ($8,999), which are all Western States. Compared to other regions, a Western state spends the least per student on average ($11,745). It can be observed in the plot above that all Western states except Wyoming and Alaska spend less on a single student that the national mean.

The observed patterns can also be vizualized in a map:

#Map visualization for the total spending per student by state
us2 <- usa_sf()
us2 <- rename(us2, STABBR = iso_3166_2)
total_map <- inner_join(total2, us2, by = "STABBR")

total_map %>% 
    ggplot(aes(geometry = geometry, 
                         fill = spend_per_stu_k,
                         color = "transparent"), 
                 alpha = 0.9) + 
    geom_sf(color = "white", size = 0.1) +
  guides(fill = guide_colorsteps(barwidth = 15,
                                 barheight = .5,
                                 title.position = "top",
                                 title.hjust = .5,
                                 title = "Total Sending\n[1000 US$ / Year]")) +
  scale_fill_viridis_c(option = "inferno",
                       breaks = c(0,5,10,15,20, 25, 30),
                       limits = c(0, 30),
                       begin = 0) +
    labs(title = "State total spendings on education per student",
             subtitle = "2010",
             caption = "Source: National Center for Education Statistics") +
  theme_void() + 
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = .08),
        plot.subtitle = element_text(hjust = .17))

#Plotting the total Spending on Instruction by state
instruction2  %>%
  ggplot(aes(E13_B, fct_reorder(STNAME, E13_B))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 42),
                     breaks = c(0, 10, 20, 30, 40), 
                     labels = c("0", "10 billion", "20 billion", "30 billion", "40 billion")) + 
  labs(title = "Total Spending on Instruction by State",
       y = "State",
       x = "Total Yearly Spending, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(E13_B, STNAME, label = paste (round(E13_B, 2), " B.")),
    nudge_x = 0.85,
    size = 2
  ) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

#plotting spending on instruction per student by state 
instruction2  %>%
  ggplot(aes(instr_per_stu_k, fct_reorder(STNAME, instr_per_stu_k))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 20),
                     breaks = c(0, 5, 10, 15), 
                     labels = c("0", "5K", "10K", "15K")) + 
  labs(title = "Total Spending on Instruction Per Single Student by State",
       y = "State",
       x = "Total Yearly Spending on Instruction Per Student, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(instr_per_stu_k, STNAME, label = paste (round(instr_per_stu_k, 2), " K")),
    nudge_x = 0.45,
    size = 2) +
  geom_vline(aes(xintercept = mean(instruction2$instr_per_stu_k)), 
             data = instruction2, 
             linetype = 'dashed',
             size = 0.6,
             alpha = 1, 
             color = "gray60") +
  geom_label(data = instruction2, 
             aes(x = mean(instruction2$instr_per_stu_k)+1.1, 
                 y = 10,
                 label = "Avg. Spending
on Istruction
Per Student 
in the US = 
$ 6,531",
                 fill = NULL), 
                 color = "gray60",
                 size = 4,
                 fontface = "italic",
                 label.size = NA)  +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

TODO: figure out how to remove ’a’s from the legend boxes

California, New York, and Texas are the leaders in the spending on instruction, which is to be expected given the states’ population sizes. However, while California has the highest total spending on education, New York spends the most total on Instruction. South Dakota, North Dakota, and District of Columbia spend the least total on Instruction.

When exploring the data for the spending on instruction per individual student, we see that New York spends the most ($12,693). That exceeds the total instructional spending per student of the two runner-up states, New Jersey with $9,706 and Vermont with $9,555. Such a drastic increase in New York’s spending can be correlated with the average teacher’s salary in that state. All Northeastern state are in the top 15 highest spenders, as was the case with the total spending per student. Interestingly, District of Columbia that spent the most total per student in 2010, is only in the 9th place when it comes to instructional expenses only; DC reported $8,194 in average instructional expenses per student. The observed patterns can also be vizualized in a map:

#Map visualization for the instructional spending per student by state
instr_map <- inner_join(instruction2, us2, by = "STABBR")

instr_map %>% 
    ggplot(aes(geometry = geometry, 
                         fill = instr_per_stu_k,
                         color = "transparent"), 
                 alpha = 0.9) + 
    geom_sf(color = "white", size = 0.1) +
  guides(fill = guide_colorsteps(barwidth = 15,
                                 barheight = .5,
                                 title.position = "top",
                                 title.hjust = .5,
                                 title = "Instructional Sending\n[1000 US$ / Year]")) +
  scale_fill_viridis_c(option = "inferno",
                       breaks = c(0, 3, 6, 9, 12, 15),
                       limits = c(0, 15),
                       begin = 0) +
    labs(title = "State instructional spendings per student",
             subtitle = "2010",
             caption = "Source: National Center for Education Statistics") +
  theme_void() + 
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = .08),
        plot.subtitle = element_text(hjust = .17))

## biscale map
biscale <- 
  scores_spending %>% 
  group_by(STNAME) %>% 
  summarize(
    rla = mean(mean_rla_all, na.rm=TRUE),
    spend = mean(spend_per_stu_k, na.rm=TRUE)
  ) %>% 
  bi_class(x = rla, y = spend, style = "quantile", dim = 3)


names(biscale)[1]<-"full"
map_join<- biscale%>%left_join(statepop, by= "full") 


p2<-plot_usmap(data = map_join, values = "bi_class", labels = TRUE, label_color = "white")   +
  bi_scale_fill(pal = "DkCyan", dim = 3, guide = F) +
  scale_x_continuous(breaks = NULL) + 
  scale_y_continuous(breaks = NULL) +
  theme(
    plot.title = element_text(margin = margin(b = 8), 
                              color = "#ffffff",face = "bold",size = 9,
                              hjust = 0.5,
                              family = "Arial"),
    plot.subtitle = element_text(margin = margin(t=10,b = 25), 
                                 color = "#ffffff", size = 6, family = "Arial",
                                 hjust = 0.5),
    plot.caption =  element_text(margin = margin(t = 20), 
                                 color = "#ffffff", size = 5, family = "Arial",
                                 hjust = 0.95),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.text.x    = element_blank(),
    axis.text.y    = element_blank(),
    panel.background = element_blank(), 
    panel.grid.major = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(), 
    plot.background = element_rect(fill = "#f3f3f3", color = NA),
    panel.border = element_blank(),
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    axis.ticks = element_blank()
  ) 

legend_US<- 
  bi_legend(pal = "DkCyan",
            dim = 3,
            xlab = "Proficiency in RLA",
            ylab = "Spend total",
            size = 20) +
  theme(rect = element_rect(fill = "grey10"),
        panel.border = element_blank(),
        axis.text = element_blank(),
        plot.background = element_rect(fill = "#f3f3f3", color = NA),
        axis.title.x = element_text(size = 10,
                                    color = "#a1a1a1",
                                    face = "bold"),
        axis.title.y = element_text(size = 10,
                                    color = "#a1a1a1",
                                    face = "bold"),
        legend.text = element_text(size = 5),
        legend.text.align = 0)


ggdraw() +
  draw_plot(p2, 0, 0, 1, 1) +
  draw_plot(legend_US, 0, 0.1, 0.2, 0.2) +
  draw_label("Source:National Center for Education Statistics)", 
             color = "#a1a1a1", size = 7.5, angle = 0, x = 0.9, y = 0.05) +
  draw_label("In which states the total spending on Education per student positively affects the average performance in RLA?", 
             color = "#000000", size = 17, angle = 0, x =0.5, y = 0.97, fontface = "bold") +
  draw_label("Bivariate map shwoing the combination of the total spending on education per student and the average demonstrated proficiency in Reading and Langguage Arts", 
             color = "#000000", size = 14, angle = 0, x =0.5, y = 0.92) +
  theme(plot.background = element_rect(fill = "#f3f3f3", color = NA)) 

We can see that the increased total spending on a student person resulted in increased proficiency test scores in such states Illinois, Ohio, New York, Maryland, Connecticut, New Hampshire, and District of Columbia. Aligning with this trend are the states such as Arizona, Oklahoma, Missouri, Tennessee, Mississippi, and Florida, where the total spend on Education was the lowest relative to other states, and the students demonstrated the lowest scores in RLA. However, this trend is not categorical across all the states. States like Texas, Idaho, Utah, Colorado, and Georgia demonstrated relatively high RLA scores while they spend less per student. At the same time, Maine exhibits the opposite pattern: the high total spending on Education in those states negatively correlates with the students’ RLA proficiency.

TODO: do the comaprison of total spendings for math scores

Next, comparing math scores with spending on instruction only

biscale2 <- 
  scores_instruction %>% 
  group_by(STNAME) %>% 
  summarize(
    math = mean(mean_math_all, na.rm=TRUE),
    instr = mean(instr_per_stu_k, na.rm=TRUE)
  ) %>% 
  bi_class(x = math, y = instr, style = "quantile", dim = 3)


names(biscale2)[1]<-"full"
map_join2<- biscale2%>%left_join(statepop, by= "full") 


p3<-plot_usmap(data = map_join2, values = "bi_class", labels = TRUE, label_color = "white")   +
  bi_scale_fill(pal = "DkCyan", dim = 3, guide = F) +
  scale_x_continuous(breaks = NULL) + 
  scale_y_continuous(breaks = NULL) +
  theme(
    plot.title = element_text(margin = margin(b = 8), 
                              color = "#ffffff",face = "bold",size = 9,
                              hjust = 0.5,
                              family = "Arial"),
    plot.subtitle = element_text(margin = margin(t=10,b = 25), 
                                 color = "#ffffff", size = 6, family = "Arial",
                                 hjust = 0.5),
    plot.caption =  element_text(margin = margin(t = 20), 
                                 color = "#ffffff", size = 5, family = "Arial",
                                 hjust = 0.95),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.text.x    = element_blank(),
    axis.text.y    = element_blank(),
    panel.background = element_blank(), 
    panel.grid.major = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(), 
    plot.background = element_rect(fill = "#f3f3f3", color = NA),
    panel.border = element_blank(),
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    axis.ticks = element_blank()
  ) 

legend_US3<- 
  bi_legend(pal = "DkCyan",
            dim = 3,
            xlab = "Proficiency in Math",
            ylab = "Instructional Spend",
            size = 20) +
  theme(rect = element_rect(fill = "grey10"),
        panel.border = element_blank(),
        axis.text = element_blank(),
        plot.background = element_rect(fill = "#f3f3f3", color = NA),
        axis.title.x = element_text(size = 10,
                                    color = "#a1a1a1",
                                    face = "bold"),
        axis.title.y = element_text(size = 10,
                                    color = "#a1a1a1",
                                    face = "bold"),
        legend.text = element_text(size = 5),
        legend.text.align = 0)


ggdraw() +
  draw_plot(p3, 0, 0, 1, 1) +
  draw_plot(legend_US3, 0, 0.1, 0.2, 0.2) +
  draw_label("Source:National Center for Education Statistics)", 
             color = "#a1a1a1", size = 7.5, angle = 0, x = 0.9, y = 0.05) +
  draw_label("In which states the total spending on Instruction per student positively affects performance in Math?", 
             color = "#000000", size = 17, angle = 0, x =0.5, y = 0.97, fontface = "bold") +
  draw_label("Bivariate map shwoing the combination of the total spending on intrusction per student and the average demonstrated proficiency in Math", 
             color = "#000000", size = 14, angle = 0, x =0.5, y = 0.92) +
  theme(plot.background = element_rect(fill = "#f3f3f3", color = NA)) 

TODO: do somth with Wyoming and Alabama (no data for them)

We can observe the general tendency of positive correlation between Expenditure on Instruction and MAth proficiency. In Nebraska, New York, Pennsylvania, Maryland, District of Columbia, and Connecticut, increased spending on instruction results in increased math scores. In Oregon, Montana, Michigan, Indiana, and Ohia, the avergae spending on instruction matches the average math scores. Arizona, Oklahoma, New Mexico, Tennessee, Mississippi, report the lowest total spend on Instruction, which correlates with the relatively low demonstrated proficiency in math by students of those states . Interestingly, Texas and Idaho that demonstrated the high RLA scores despite small total spending, exhibit the same pattern here: hogh math scores despite low spending on Instruction. Other states that show the same pattern with instructional spending are Colorado, South Dakota, and North Carolina. On the flip side, Maine again exhibits the opposite pattern: the high instructional spending negatively correlates with the students’ math proficiency. Here, the same trend is observed in Minnesota and Hawaii.

TODO: do the comaprison of instruction spendings for rla scores

#Plotting the total Spending on SPecial Ed by state
SpEd2  %>%
  ggplot(aes(Z36_B, fct_reorder(STNAME, Z36_B))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 4),
                     breaks = c(0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5), 
                     labels = c("0", "500 million", "1 billion", "1.5 billion", "2 billion", "2.5 billion", "3 billion", "3.5 billion"))+ 
  labs(title = "Total Spending on Special Ed Teachers by State",
       y = "State",
       x = "Total Yearly Spending on Special Ed, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(Z36_B, STNAME, label = paste (round(Z36_B, 2), " B.")),
    nudge_x = 0.1,
    size = 3
  ) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

biscale4 <- 
  scores_SpEd %>% 
  group_by(STNAME) %>% 
  summarize(
    math = mean(mean_math_dis, na.rm=TRUE),
    SpEd = mean(Z36, na.rm=TRUE)
  ) %>% 
  bi_class(x = math, y = SpEd, style = "quantile", dim = 3)


names(biscale4)[1]<-"full"
map_join4<- biscale4%>%left_join(statepop, by= "full") 


p4<-plot_usmap(data = map_join4, values = "bi_class", labels = TRUE, label_color = "white")   +
  bi_scale_fill(pal = "DkCyan", dim = 3, guide = F) +
  scale_x_continuous(breaks = NULL) + 
  scale_y_continuous(breaks = NULL) +
  theme(
    plot.title = element_text(margin = margin(b = 8), 
                              color = "#ffffff",face = "bold",size = 9,
                              hjust = 0.5,
                              family = "Arial"),
    plot.subtitle = element_text(margin = margin(t=10,b = 25), 
                                 color = "#ffffff", size = 6, family = "Arial",
                                 hjust = 0.5),
    plot.caption =  element_text(margin = margin(t = 20), 
                                 color = "#ffffff", size = 5, family = "Arial",
                                 hjust = 0.95),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.text.x    = element_blank(),
    axis.text.y    = element_blank(),
    panel.background = element_blank(), 
    panel.grid.major = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(), 
    plot.background = element_rect(fill = "#f3f3f3", color = NA),
    panel.border = element_blank(),
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    axis.ticks = element_blank()
  ) 

legend_US4<- 
  bi_legend(pal = "DkCyan",
            dim = 3,
            xlab = "Proficiency in Math",
            ylab = "Special Ed Spend",
            size = 20) +
  theme(rect = element_rect(fill = "grey10"),
        panel.border = element_blank(),
        axis.text = element_blank(),
        plot.background = element_rect(fill = "#f3f3f3", color = NA),
        axis.title.x = element_text(size = 10,
                                    color = "#a1a1a1",
                                    face = "bold"),
        axis.title.y = element_text(size = 10,
                                    color = "#a1a1a1",
                                    face = "bold"),
        legend.text = element_text(size = 5),
        legend.text.align = 0)


ggdraw() +
  draw_plot(p4, 0, 0, 1, 1) +
  draw_plot(legend_US4, 0, 0.1, 0.2, 0.2) +
  draw_label("Source:National Center for Education Statistics)", 
             color = "#a1a1a1", size = 7.5, angle = 0, x = 0.9, y = 0.05) +
  draw_label("In which states the total spending on Special Edcuation positively affects performance of children with disabilities in Math?", 
             color = "#000000", size = 14, angle = 0, x =0.5, y = 0.97, fontface = "bold") +
  draw_label("Bivariate map showing the combination of the total spending on Special Education and the average demonstrated proficiency in Math of children with disabilities", 
             color = "#000000", size = 12, angle = 0, x =0.5, y = 0.92) +
  theme(plot.background = element_rect(fill = "#f3f3f3", color = NA)) 

Generally, all the states that spend the most on special education report the highest permormance results on math tests.

TODO: deal with the states that have no data TODO: besides bivariate maps, include scatterplots to show all the correlations

#Plot of Total Spending on Textbooks by state
textbooks2  %>%
  ggplot(aes(V93_M, fct_reorder(STNAME, V93_M))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 350),
                     breaks = c(0, 50, 100, 150, 200, 250, 300, 350), 
                     labels = c("0", "50 million", "100 million", "150 million", "200 million", "250 million", "300 million", "350 million")) + 
  labs(title = "Total Spending on Textbook by State",
       y = "State",
       x = "Total Textbook Spending, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(V93_M, STNAME, label = paste (round(V93_M, 2), " M.")),
    nudge_x = 8,
    size = 2
  ) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

#plotting spending on textbooks per student by state 
textbooks2  %>%
  ggplot(aes(book_per_stu, fct_reorder(STNAME, book_per_stu))) +
  geom_col(aes(fill = Region),
           alpha = 0.9) +
  scale_fill_OkabeIto() + 
  scale_x_continuous(expand = c(0, 0), 
                     limits = c(0, 150),
                     breaks = c(0, 25, 50, 75, 100, 125, 150))+ 
                    # labels = c("0", "5K", "10K", "15K")) + 
  labs(title = "Total Spending on Textbooks Per Single Student by State",
       y = "State",
       x = "Total Yearly Spending on Textbooks Per Student, in USD",
       caption = "Source: National Center for Education Statistics", 
       fill = "Region") + 
  geom_text(
    aes(book_per_stu, STNAME, label = round(book_per_stu, 2)),
    nudge_x = 2.2,
    size = 3) +
  geom_vline(aes(xintercept = mean(textbooks2$book_per_stu)), 
             data = instruction2, 
             linetype = 'dashed',
             size = 0.6,
             alpha = 1, 
             color = "gray60") +
  geom_label(data = instruction2, 
             aes(x = mean(textbooks2$book_per_stu)+8, 
                 y = 10,
                 label = "Avg. Spending
on Textbooks
Per Student 
in the US = 
$ 60.01",
                 fill = NULL), 
                 color = "gray60",
                 size = 4,
                 fontface = "italic",
                 label.size = NA)  +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 

Data Visualization 3

Amy

Research Question:

What is the relationship between the local revenue of local education agencies and students’ literacy outcomes on statewide assessments in 2010? Additional areas of exploration included (a) average total local revenue and local revenue property taxes by state, (b) the relationship between outcomes and total local revenue vs. local revenue from property taxes, (c) the relationship between both types of funding and outcomes by state, and (d) relationship between both types of revenue and outcomes of student subgroups (e.g., race/ethnicity, disability status, language proficiency, SES).

Datasets:

  • EDFacts_rla_achievement_lea_2010_2019
  • NCES_CCD_fiscal_district_2010

Variables:

EDFacts_rla_achievement_lea_2010_2019

  • YEAR
  • STNAM
  • FIPST
  • LEAID
  • ALL_RLA00PCTPROF = Percentage of all students who scored at or above their state’s proficiency level on Reading/Language Arts.
  • MAM_RLA03PCTPROF = Percentage of Native American students that scored at or above proficient
  • MAS_RLA00PCTPROF = Percentage of Asian/Pacific Islander students that scored at or above proficient
  • MBL_RLA00PCTPROF = Percentage of Black students that scored at or above proficient
  • MHI_RLA00PCTPROF = Percentage of Hispanic students that scored at or above proficient
  • MTR_RLA00PCTPROF = Percentage of students with Two or More Races that scored at or above proficient
  • MWH_RLA03PCTPROF = Percentage of White students that scored at or above proficient
  • CWD_RLA00PCTPROF = Percentage of children with disabilities that scored at or above proficient
  • ECD_RLA00PCTPROF = Percentage of economically disadvantaged students that scored at or above proficient
  • LEP_RLA00PCTPROF = Percentage of limited English proficient students that scored at or above proficient

NCES_CCD_fiscal_district_2010:

  • NAME
  • STABBR
  • CENSUSID
  • V33 = LEA fall membership (i.e., number of students)
  • TOTALREV = LEA total revenue
  • TFEDREV = LEA total federal revenue
  • TSTREV = LEA total state revenue
  • TLOCREV = LEA total local revenue
  • T06 = LEA local revenue from property taxes

Cleaning and Wrangling

# Selected columns of interest. Filtered so that rows with suppressed values (e.g., -9, -2) for key variables of interest weren't included. 
viz3_fiscal2010 <- fiscal2010 %>% 
  select(LEAID, NAME, STABBR, CENSUSID, V33,
         TOTALREV, TFEDREV, TSTREV, TLOCREV, T06) %>% 
  filter(V33 > 0, TLOCREV > 0, T06 > 0) %>%
  rename_with(tolower) %>% 
  rename(totalstu = v33, tlocrevtaxes = t06) %>% 
  mutate(locrev_stu = tlocrev / totalstu,
         locrevtaxes_stu = tlocrevtaxes / totalstu)
# Narrowed the RLA 2010 file down to variables of interest. Selected percent proficient variables across all grades (00) for race ethnicity, disability, English Language Learner status, and economically disadvantaged subgroups. Transformed variable names to be lowercase, transformed state names to be title case, and replaced the suppressed values (e.g., PS, n/a, etc.) with NA.
viz3_rlalea00 <- rlalea_10 %>% 
  select(YEAR, 
         STNAM, 
         FIPST,
         LEAID, 
         ALL_RLA00PCTPROF, 
         MAM_RLA00PCTPROF,
         MAS_RLA00PCTPROF,
         MBL_RLA00PCTPROF,
         MHI_RLA00PCTPROF,
         MTR_RLA00PCTPROF, 
         MWH_RLA00PCTPROF,
         CWD_RLA00PCTPROF,
         ECD_RLA00PCTPROF,
         LEP_RLA00PCTPROF) %>% 
  rename_with(tolower)  %>% 
  mutate(stnam = str_to_title(stnam), 
         (across(all_rla00pctprof:lep_rla00pctprof, 
                 ~replace(., . %in% c("PS", 
                                      "n/a",    
                                      "LT50",   
                                      "LE5",    
                                      "LE20",   
                                      "LE10",   
                                      "GE99",   
                                      "GE95",   
                                      "GE90",   
                                      "GE80",   
                                      "GE50"), NA))))

# Next step was cleaning the percentage columns to change percentage ranges to average percentages. I used the method Daniel used on the course data webpage and applied it to all subgroups.

# All = across all students
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(all_rla00pctprof, c("all_lower", "all_upper"), sep = "-") %>% 
  mutate(
    all_upper = ifelse(is.na(all_upper), all_lower, all_upper),
    all_lower = as.numeric(all_lower),
    all_upper = as.numeric(all_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_all = mean(c(all_lower, all_upper))) %>% 
  ungroup()

# mam = American Indian/Alaska Native
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(mam_rla00pctprof, c("mam_lower", "mam_upper"), sep = "-") %>% 
  mutate(
    mam_upper = ifelse(is.na(mam_upper), mam_lower, mam_upper),
    mam_lower = as.numeric(mam_lower),
    mam_upper = as.numeric(mam_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_mam = mean(c(mam_lower, mam_upper))) %>% 
  ungroup()

# mas = Asian/Pacific Islander
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(mas_rla00pctprof, c("mas_lower", "mas_upper"), sep = "-") %>% 
  mutate(
    mas_upper = ifelse(is.na(mas_upper), mas_lower, mas_upper),
    mas_lower = as.numeric(mas_lower),
    mas_upper = as.numeric(mas_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_mas = mean(c(mas_lower, mas_upper))) %>% 
  ungroup()

# mbl = Black
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(mbl_rla00pctprof, c("mbl_lower", "mbl_upper"), sep = "-") %>% 
  mutate(
    mbl_upper = ifelse(is.na(mbl_upper), mbl_lower, mbl_upper),
    mbl_lower = as.numeric(mbl_lower),
    mbl_upper = as.numeric(mbl_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_mbl = mean(c(mbl_lower, mbl_upper))) %>% 
  ungroup()

# mhi = Hispanic/Latino
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(mhi_rla00pctprof, c("mhi_lower", "mhi_upper"), sep = "-") %>% 
  mutate(
    mhi_upper = ifelse(is.na(mhi_upper), mhi_lower, mhi_upper),
    mhi_lower = as.numeric(mhi_lower),
    mhi_upper = as.numeric(mhi_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_mhi = mean(c(mhi_lower, mhi_upper))) %>% 
  ungroup()

# mtr = Multiracial
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(mtr_rla00pctprof, c("mtr_lower", "mtr_upper"), sep = "-") %>% 
  mutate(
    mtr_upper = ifelse(is.na(mtr_upper), mtr_lower, mtr_upper),
    mtr_lower = as.numeric(mtr_lower),
    mtr_upper = as.numeric(mtr_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_mtr = mean(c(mtr_lower, mtr_upper))) %>% 
  ungroup()

# mwh = White
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(mwh_rla00pctprof, c("mwh_lower", "mwh_upper"), sep = "-") %>% 
  mutate(
    mwh_upper = ifelse(is.na(mwh_upper), mwh_lower, mwh_upper),
    mwh_lower = as.numeric(mwh_lower),
    mwh_upper = as.numeric(mwh_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_mwh = mean(c(mwh_lower, mwh_upper))) %>% 
  ungroup()

# cwd = children with disabilities
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(cwd_rla00pctprof, c("cwd_lower", "cwd_upper"), sep = "-") %>% 
  mutate(
    cwd_upper = ifelse(is.na(cwd_upper), cwd_lower, cwd_upper),
    cwd_lower = as.numeric(cwd_lower),
    cwd_upper = as.numeric(cwd_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_cwd = mean(c(cwd_lower, cwd_upper))) %>% 
  ungroup()

# ecd = economically disadvantaged
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(ecd_rla00pctprof, c("ecd_lower", "ecd_upper"), sep = "-") %>% 
  mutate(
    ecd_upper = ifelse(is.na(ecd_upper), ecd_lower, ecd_upper),
    ecd_lower = as.numeric(ecd_lower),
    ecd_upper = as.numeric(ecd_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_ecd = mean(c(ecd_lower, ecd_upper))) %>% 
  ungroup()

# lep = limited English proficiency (English Language Learner)
viz3_rlalea00 <- viz3_rlalea00 %>% 
  tidyr::separate(lep_rla00pctprof, c("lep_lower", "lep_upper"), sep = "-") %>% 
  mutate(
    lep_upper = ifelse(is.na(lep_upper), lep_lower, lep_upper),
    lep_lower = as.numeric(lep_lower),
    lep_upper = as.numeric(lep_upper)
    ) %>% 
  rowwise() %>% 
  mutate(meanpctprof_lep = mean(c(lep_lower, lep_upper))) %>% 
  ungroup()

# Get ride of the "_lower" and "_upper" percentage columns since they won't be needed
viz3_rlalea00 <- viz3_rlalea00 %>% 
  select(year, 
         stnam, 
         fipst,
         leaid,
         contains("meanpctprof"))
# Pivoted the dataset longer to have a column for subgroup and a column for mean percentage proficient. 

viz3_rlalea00_long <- viz3_rlalea00 %>%
  pivot_longer(
        cols = contains("meanpctprof"),
        names_to = "subgroup",
        values_to = "meanpctprof",
        names_prefix = "meanpctprof_") 
# Joined the long file with the cleaned/narrowed fiscal data file. Used an inner join because I'm only interested in LEAs that have both student proficiency and fiscal data. 
viz3_rla00long_fiscal_2010 <- inner_join(viz3_rlalea00_long, 
                                         viz3_fiscal2010, 
                                         by = "leaid")

Potential Visualizations

These are the data visualizations I am thinking I will choose from for our final product. I’m not going to include all of these. My primary next step is to narrow them down. Please note that they are still a bit rough and need refinement (some more than others). Refinement plans include… finalizing color, trying out annotations and/or highlighting, and exploring alternate options for faceting by state. If I include the plots with fitted lines, I plan to update the colors and replace the legend with annotations.

First off, bar graphs. The first two summarize LEA local revenue and LEA local revenue from property taxes averaged by state. The third graph summarizes the average percentage of students scoring at/above proficient by state.

# Bar graph: Average LEA total local revenue ($ per student) by state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all")  %>% 
  group_by(stnam) %>% 
  summarize(mean_locrev_stu = mean(locrev_stu)) %>% 
  ggplot(aes(x = mean_locrev_stu, y = fct_reorder(stnam, mean_locrev_stu))) +
  geom_col(color = "white", alpha = .6) +
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(0, 2500, 5000, 7500, 10000, 12500),
                     labels = scales::dollar) +
  labs(title = "Average Local Revenue of LEAs",
       y = "State",
       x = "Dollar per student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())  

# Bar graph: Average LEA local revenue from property taxes ($ per student) by state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all")  %>% 
  group_by(stnam) %>% 
  summarize(mean_locrevtaxes_stu = mean(locrevtaxes_stu)) %>% 
  ggplot(aes(x = mean_locrevtaxes_stu, 
             y = fct_reorder(stnam, mean_locrevtaxes_stu))) +
  geom_col(color = "white", alpha = .6) +
  scale_x_continuous(expand = c(0, 0),
                    breaks = c(0, 2500, 5000, 7500, 10000),
                     labels = scales::dollar) +
  labs(title = "Average Local Revenue of LEAs from Property Taxes",
       y = "State",
       x = "Dollar per student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

# Bar graph of average RLA proficiency for each state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all")  %>% 
  group_by(stnam) %>% 
  summarize(mean_pctprof = mean(meanpctprof, na.rm = T)) %>% 
  ggplot(aes(x = mean_pctprof, y = fct_reorder(stnam, mean_pctprof))) +
  geom_col(color = "white", alpha = .6) +
  scale_x_continuous(expand = c(0, 0),
                    breaks = c(0, 20, 40, 60, 80),
                    labels = c("0%", "20%", "40%", "60%", "80%")) +
  labs(title = "Average Proficiency in Reading/Language Arts",
       subtitle = "Students in Grades 3 through HS",
       y = "State",
       x = "Average Percentage",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())  

Density ridges of local revenue from property taxes for LEAs by state. One thing I plan to do, if I include this plot, that I didn’t get to is sorting the states by mean revenue.

# Density ridges of local revenue from property taxes across LEAs by state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all")  %>% 
  ggplot(aes(x = locrevtaxes_stu, y = stnam)) +
    geom_density_ridges(fill = "cornflower blue", color = "white", alpha = .8) +
    theme_minimal() +
  labs(title = "Local Revenue from Property Taxes in LEAS of Each State",
       y = "State",
       x = "Dollar per student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot") + 
  scale_x_continuous(expand = c(0, 0)) +
  coord_cartesian(xlim = c(0, 25000))

Scatterplots showing the relationship between types of revenue and average percentage of students scoring at/above proficient on statewide assessments of reading/language arts. Note that I used a log transformation of the x-axis to spread the points out. Without the transformation, the bulk of the points were clustered near the bottom of the range. I think that I could use highlighting and annotations for the lowest and highest points.

# Scatterplots: All students, LEA total local revenue ($ per stu) and LEA local revenue from property taxes by approx. pct proficient
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  geom_point(color = "gray30", fill = "gray30", alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Relationship between Local Revenue and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank()) 

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all") %>% 
  ggplot(aes(x = locrevtaxes_stu, y = meanpctprof)) +
  geom_point(color = "gray30", fill = "gray30", alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Local Revenue from Property Taxes and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Scatterplots faceted by state.

# Scatterplot: All students - relationship between local revenue ($ per student) and mean % proficient, faceted by state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(color = "gray30", fill = "gray30", alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Total Local Revenue and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank()) 

# Scatterplot: All students - relationship between local revenue from property taxes ($ per student) and mean % proficient, faceted by state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all") %>% 
  ggplot(aes(x = locrevtaxes_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(color = "gray30", fill = "gray30", alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Local Revenue from Property Taxes and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Scatterplots showing the relationship between revenue and outcomes, faceted by subgroup. These are still pretty rough (e.g., labels need work).

# Scatterplot: Relationship between local revenue ($ per student) and mean % proficient, faceted by subgroup
viz3_rla00long_fiscal_2010 %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~subgroup) +
  geom_point(color = "gray30", fill = "gray30", alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Total Local Revenue and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank()) 

# Scatterplot: Relationship between local revenue from property taxes ($ per student) and mean % proficient, faceted by subgroup
viz3_rla00long_fiscal_2010 %>% 
  ggplot(aes(x = locrevtaxes_stu, y = meanpctprof)) +
  facet_wrap(~subgroup) +
  geom_point(color = "gray30", fill = "gray30", alpha = .4) +
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Local Revenue from Property Taxes and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Playing around with fitting lines. These are still rough. I need to work on the legend and want to try replacing it with annotations. Also could consider changing the color to highlight a specific group or pick a different color palette.

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup != "all") %>% 
  ggplot() +
  geom_smooth(method = lm, 
              se = F, 
              aes(x = locrevtaxes_stu, 
                  y = meanpctprof, 
                  color = subgroup)) +
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Local Revenue from Property Taxes and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Fitted lines faceted by state (also very rough).

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup != "all") %>% 
  ggplot() +
  geom_smooth(method = lm, se = F, aes(x = locrevtaxes_stu, y = meanpctprof, color = subgroup)) +
  facet_wrap(~stnam) +
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Local Revenue from Property Taxes and RLA Proficiency",
       y = "Approximate Average Percent Proficient",
       x = "Dollar per Student",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Created maps displaying (a) average LEA local revenue in each state and (b) average LEA local revenue from property taxes in each state. I still need to fill in the missing states and want to try out different color palettes.

viz3_map_dfprep <- viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all") %>% 
  group_by(stnam) %>% 
  summarize(mean_locrev_stu = mean(locrev_stu),
            mean_locrevtaxes_stu = mean(locrevtaxes_stu))

viz3_us <- usa_sf() 

viz3_map_df <- left_join(viz3_map_dfprep, 
                      viz3_us, 
                      by = c("stnam" = "name"))

viz3_map_df %>% 
  ggplot(aes(geometry = geometry, fill = mean_locrev_stu)) +
  geom_sf(color = "white", size = 0) +
    scale_fill_viridis(option = "magma", 
                       name = "Dollar per student",
                                         breaks = c(0, 2500, 5000, 7500, 10000, 12500),
                                         labels = c("$0", 
                                                    "$2,500", 
                                                    "$5,000", 
                                                    "$7,500", 
                                                    "$10,000",
                                                    "$12,500")) +
  theme_void() +
  labs(title = "Average LEA Total Local Revenue",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme(plot.title.position = "plot") 

viz3_map_df %>% 
  ggplot(aes(geometry = geometry, fill = mean_locrevtaxes_stu)) +
  geom_sf(color = "white", size = 0) +
    scale_fill_viridis(option = "magma", 
                       name = "Dollar per student",
                                         breaks = c(0, 2500, 5000, 7500, 10000),
                                         labels = c("$0", 
                                                    "$2,500", 
                                                    "$5,000", 
                                                    "$7,500", 
                                                    "$10,000")) +
  theme_void() +
  labs(title = "Average LEA Local Revenue from Property Taxes",
       caption = "Source: National Center for Education Statistics, 2010") +
  theme(plot.title.position = "plot") 

Other Data Visualizations

These are some of the preliminary visualizations I did that I don’t think I’m moving forward with. Because of this, refinement is minimal.

# Scatterplot: Total local LEA revenue from property taxes ($ per stu) x approx. pct proficient, color = subgroup
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup != "all") %>% 
  ggplot(aes(x = locrevtaxes_stu, y = meanpctprof, color = subgroup)) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  theme_minimal() +
  labs(title = "Revenue from property tax x meanpctprof, color by subgroup") +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Fitted lines for specific states:

# fitted lines in a few specific states
viz3_rla00long_fiscal_2010 %>% 
  filter(stnam == "Montana") %>% 
  ggplot() +
  geom_smooth(method = lm, 
              se = F, 
              aes(x = locrevtaxes_stu, 
                  y = meanpctprof, 
                  color = subgroup)) +
  scale_x_log10(labels = scales::dollar) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())  +
  labs(title = "Montana")

viz3_rla00long_fiscal_2010 %>% 
  filter(stnam == "South Dakota") %>% 
  ggplot() +
  geom_smooth(method = lm, 
              se = F, 
              aes(x = locrevtaxes_stu, 
                  y = meanpctprof, 
                  color = subgroup)) +
  scale_x_log10(labels = scales::dollar) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "South Dakota")

viz3_rla00long_fiscal_2010 %>% 
  filter(stnam == "New Jersey") %>% 
  ggplot() +
  geom_smooth(method = lm, 
              se = F, 
              aes(x = locrevtaxes_stu, 
                  y = meanpctprof, 
                  color = subgroup)) +
  scale_x_log10(labels = scales::dollar) +
  theme_minimal() +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "New Jersey")

# Fitting a line over data points for local revenue x meanpctprof by state
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "all") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  geom_smooth(method = "lm") +
  facet_wrap(~stnam) +
  geom_point(alpha = .1) + 
  scale_x_log10(labels = scales::dollar) 

# Scatterplot: Relationship between local revenue from property taxes ($ per student) and mean % proficient, faceted by state, color by subgroup 
viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup != "all") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof, color = subgroup)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar)

Scatterplots for each student subgroup between local revenue from property tax and % proficient, faceted by state. The subgroup is indicated in the title.

# Scatterplots: Relationship between local revenue from property taxes ($ per student) and mean % proficient, faceted by state for each subgroup 

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "mam") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "American Indian/Alaska Native")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "mas") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Asian/Pacific Islander")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "mhi") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Hispanic/Latino")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "mbl") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Black")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "mwh") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "White")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "mtr") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Multiracial")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "cwd") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Students with Disabilities")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "ecd") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Economically Disadvantaged")

viz3_rla00long_fiscal_2010 %>% 
  filter(subgroup == "lep") %>% 
  ggplot(aes(x = locrev_stu, y = meanpctprof)) +
  facet_wrap(~stnam) +
  geom_point(alpha = .4) + 
  scale_x_log10(labels = scales::dollar) +
  labs(title = "Limited English Proficiency")